# Setup: load project packages and experiment configuration.
# NOTE(review): `import()` is a project helper, not base R — confirm which
# package provides it.
suppressPackageStartupMessages({
import(rpkgs)
})
import(run)
import(util)
## [1] TRUE
import(perfplot)
# Model identifier used to tag runs and to look up scores in the metrics table.
modelName = "smacd my lgbm up"
# Window sizes in minutes (minute-resolution data assumed — TODO confirm).
trnAmt = 60 * 24 * 7 * 1 # 1 week of training data
tstAmt = 60 * 24 * 7 * 2 # 2 weeks, submission period will provide new data every 2 weeks
assets = getAllAssets()
## 2021-12-03 00:35:24 INFO::Sourcing ALL_ASSETS
numSamples = 200 # number of repeated experiment runs
# Build the modelling dataset for one run and publish it on `env`.
#
# Queries the `trn` table for rows with ts between minDate and maxDate for the
# given asset ids, computing features with SQL window functions:
#   target_lag_15               target lagged 15 rows per asset
#   vwap_slope_30/vwap_slope_7  rolling regression slopes of vwap on epoch time
#   sma_close_26/sma_close_12   rolling means of close
#   smacd_close                 sma_close_26 - sma_close_12
#   smacd_close_slope_6         rolling slope of smacd_close
#
# NOTE(review): conventional MACD is fast minus slow (12 - 26); smacd_close is
# the negation of that — confirm the sign is intentional.
#
# Fix: the t3 window now partitions by asset_id, consistent with every other
# window in this query (previously PARTITION BY asset_name, which silently
# merges assets if two share a name).
#
# The result is split into env$info (ts, asset_name bookkeeping), env$y (the
# target) and env$x (all remaining feature columns). `env` is an environment,
# so these assignments persist for the caller.
makeData = \(env, minDate, maxDate, assets, ...) {
# NOTE(review): asset ids are glue-interpolated straight into the SQL. That is
# acceptable only because they come from our own assets table — never pass
# untrusted input through this parameter.
selectStmt = glue('
WITH
t1 AS (
SELECT
ts
, asset_name
, target
, LAG(target, 15) OVER (PARTITION BY asset_id ORDER BY ts ASC)
AS target_lag_15
, REGR_SLOPE(vwap, EXTRACT(EPOCH FROM ts)::real)
OVER (PARTITION BY asset_id ORDER BY ts ASC ROWS 30 PRECEDING)
AS vwap_slope_30
, REGR_SLOPE(vwap, EXTRACT(EPOCH FROM ts)::real)
OVER (PARTITION BY asset_id ORDER BY ts ASC ROWS 7 PRECEDING)
AS vwap_slope_7
, AVG(close) OVER (PARTITION BY asset_id ORDER BY ts ASC ROWS 26 PRECEDING)
AS sma_close_26
, AVG(close) OVER (PARTITION BY asset_id ORDER BY ts ASC ROWS 12 PRECEDING)
AS sma_close_12
FROM trn
WHERE (ts BETWEEN $1 AND $2)
AND asset_id IN ({paste(assets, collapse = ", ")})
),
t2 AS (
SELECT
*
, sma_close_26 - sma_close_12
AS smacd_close
FROM t1
),
t3 AS (
SELECT
*
, REGR_SLOPE(smacd_close, EXTRACT(EPOCH FROM ts)::REAL)
OVER (PARTITION BY asset_id ORDER BY ts ASC ROWS 6 PRECEDING)
AS smacd_close_slope_6
FROM t2
)
SELECT * FROM t3
')
df = getQuery(selectStmt, params = list(minDate, maxDate))
# Keep bookkeeping columns aside so env$x contains only features.
env$info = data.table(
ts = df[,ts],
asset_name = df[,asset_name]
)
# df[,"ts_std"] <- as.integer(df[,ts])
df[,"ts"] <- NULL
df[,"asset_name"] <- NULL
env$y = df[,.(target)]
df[,"target"] <- NULL
env$x = df
}
# Fit the model for one run.
#
# model, trn and tst are environments: fitted artifacts (per-column
# standardisers, the forward transform, the lgbm booster) are stored on
# `model`, and trn/tst have their x and y standardised IN PLACE by
# dataFwdTransform — downstream code (predictModel, plotting) relies on that.
trainModel = \(model, trn, tst, ...) {
# give the model a description
model$description = 'lgbm basic test'
# Helpers to name/fetch the standardiser stored on the model for a column.
model$nameOfStandardiserFor = \(name) glue('{name}_standardiser')
model$getStandardiser = \(name) model[[model$nameOfStandardiserFor(name)]]
# fit X standardisers (lapply used purely for its side effect on `model`)
lapply(colnames(trn$x), \(name) {
model[[model$nameOfStandardiserFor(name)]] = as.standardiser(trn$x[[name]])
})
# fit y standardiser
model$target_standardiser = as.standardiser(trn$y[,target])
# define forward data transform
model$dataFwdTransform = \(env) {
# standardise Xs
lapply(colnames(env$x), \(name) {
env$x[[name]] = predict(model$getStandardiser(name), env$x[[name]])
})
# standardise the target
env$y[,"target"] = predict(model$target_standardiser, env$y[,target])
}
# fwdTranform the data
model$dataFwdTransform(trn)
model$dataFwdTransform(tst) # TODO should not be the same tst
trn = lgb.Dataset(
as.matrix(trn$x),
colnames = colnames(trn$x),
label = trn$y[,target],
init_score = rep(0, nrow(trn$x))
)
# Fix: validation data must be constructed FROM the training Dataset via
# lgb.Dataset.create.valid so it shares the training bin mappings; a
# free-standing lgb.Dataset is not a valid validation set in lightgbm.
tst = lgb.Dataset.create.valid(
trn,
as.matrix(tst$x),
label = tst$y[,target]
)
nrounds = 1000
# train the model
model$lgbm = lgb.train(
data = trn,
params = list(
num_threads = max(parallel::detectCores(FALSE) - 1, 1),
seed = 5280,
num_iterations = nrounds,
num_leaves = 31,
max_depth = 0, # <= 0 means no depth limit in lightgbm
learning_rate = 0.002,
boosting = "gbdt",
min_data_in_leaf = 3,
min_sum_hessian_in_leaf = 0.01,
lambda_l1 = 0.01,
lambda_l2 = 0.01
),
valids = list(
trn = trn,
tst = tst
),
obj = "regression_l1",
eval = c(
"l2"
),
record = TRUE,
early_stopping_rounds = 100
)
}
# Generate predictions for the test set and store them on the tst
# environment as `yhat`.
predictModel = \(model, tst, ...) {
# The forward transform is deliberately NOT re-applied here: trainModel
# already standardised this same tst environment in place (see its TODO).
features = as.matrix(tst$x)
tst$yhat = predict(model$lgbm, features)
}
# Repeat the experiment numSamples times. Each doRun draws a fresh
# train/test window and records its metrics; `results` is overwritten every
# iteration, so only the LAST run survives for the plots below (intentional).
set.seed(2057944)
# seq_len instead of 1:numSamples: safe even if numSamples were ever 0.
for (i in seq_len(numSamples)) {
results = doRun(
name = modelName,
trnAmt = trnAmt,
tstAmt = tstAmt,
# only the third asset's id is used — single-asset experiment
assets = assets[3,asset_id],
makeData = makeData,
trainModel = trainModel,
predictModel = predictModel
)
}
These plots are just for the last run of the model.
# Assemble a plotting table for the LAST run only: features plus the
# ts/asset_name bookkeeping columns, the (standardised) target and the
# model's prediction.
set.seed(684280)
pd = results$tst$x
pd = cbind(pd, results$tst$info)
pd$y = results$tst$y$target
pd$yhat = results$tst$yhat
# Eyeball a random ~300-minute window of the last run's data.
#   plotType "features": every column for one randomly chosen asset
#   plotType "forecast": just y vs yhat for one randomly chosen asset
# NOTE(review): the asset and the time window are sampled independently, so
# the chosen asset may have few (or no) rows inside the window.
sanityCheckingPlot = \(plotType) {
# sample of data
numPointsForPlot = 300
# NOTE(review): assumes nrow(pd) is comfortably larger than 300 — otherwise
# the subscript is invalid, and sample() on a length-1 vector would draw
# from 1:x instead of the vector itself.
plotStart = sample(pd[1:(nrow(pd) - numPointsForPlot),ts], 1)
plotEnd = plotStart + as.difftime(numPointsForPlot, units = "mins")
if (plotType == "features") {
# all columns (melted to long form), one line per variable
pd[
asset_name == pd[sample(nrow(pd), 1),asset_name]
& ts > plotStart
& ts < plotEnd
] |>
melt(id.vars = c("ts", "asset_name")) |>
ggplot(aes(ts, value, colour = variable)) +
geom_line() +
facet_wrap(~ asset_name)
} else if (plotType == "forecast") {
# just actual vs predicted target
pd[
asset_name == pd[sample(nrow(pd), 1),asset_name]
& ts > plotStart
& ts < plotEnd,
.(ts, asset_name, y, yhat)
] |>
melt(id.vars = c("ts", "asset_name")) |>
ggplot(aes(ts, value, colour = variable)) +
geom_line() +
facet_wrap(~ asset_name, ncol = 1)
}
}
# Plot y and yhat in a +/-100-minute window centred on the single largest
# target value of the last run.
plotAroundBiggest = \() {
# which.max skips NAs and breaks ties by taking the first row, so the
# window anchors are guaranteed to be scalars. (The previous
# pd[y == max(y)] lookup returned multiple rows on tied maxima, making
# plotStart/plotEnd vectors and the subset nonsensical.)
maxidx = which.max(pd[,y])
maxts = pd[maxidx,ts]
maxasset = pd[maxidx,asset_name]
plotStart = maxts - as.difftime(100, units = "mins")
plotEnd = maxts + as.difftime(100, units = "mins")
pd[
asset_name == maxasset
& ts > plotStart
& ts <= plotEnd,
.(ts, y, yhat)
] |>
melt(id.vars = "ts") |>
ggplot(aes(x = ts, y = value, colour = variable)) +
geom_line()
}
# Extract lightgbm's recorded evaluation history (l2 on trn and tst) as a
# long data.table with columns iteration, dataset, metric, value.
getEvaluationLog = \(lgbm) {
# seq_len: safe when no iterations were run (1:0 would yield c(1, 0))
iteration = seq_len(lgbm$current_iter())
# one (iteration, value) table per dataset/metric pair
getResult = \(dataset, metric) {
data.table(
iteration = iteration,
dataset = dataset,
metric = metric,
value = lgb.get.eval.result(lgbm, dataset, metric)
)
}
eval_log = rbind(
getResult("trn", "l2"),
getResult("tst", "l2")
)
eval_log
}
The competition metric is correlation between your predictions and the targets.
Visualising this:
## Warning: Removed 3 rows containing non-finite values (stat_bin2d).
Remember, that’s just for 1 run; we repeated that experiment 200 times!
# Pull every recorded run of this model from the metrics table and summarise
# the distribution of the correlation metric across all 200 runs.
scores = getQuery('SELECT * FROM metrics WHERE name = $1', params = list(modelName))
#DT::datatable(scores[,.(run_id, corr, mae, aae, rmse)])
scores[,.(run_id, corr, mae, aae, rmse)]
p = performancePlot(scores[,corr])
print(p)
## performancePlot:
## count = 200
## mean = 0.03799060727235
## sd = 0.067167969602342
## sem = 0.00474949267843479
## range = 0.49512962
## binwidth = 0.0165043206666667
## relativePlot = <ggplot plot object>
## absolutePlot = <ggplot plot object>
## t.test =
## One Sample t-test
##
## data: x
## t = 7.9989, df = 199, p-value = 1.013e-13
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.02862481 0.04735640
## sample estimates:
## mean of x
## 0.03799061
##
## Shapiro-Wilk normality test
##
## data: x
## W = 0.77894, p-value = 4.366e-16